import jieba
import collections
import re
import random

def randomColor():
    """Return a random colour as a ``#RRGGBB`` hex string.

    Each of the six digits is drawn independently and uniformly from the
    full uppercase hexadecimal alphabet ``0``-``F``.

    Returns:
        A 7-character string: ``#`` followed by six hex digits.
    """
    # The alphabet must include '0'; without it colours such as #000000 or
    # #FF0000 can never be produced.
    hexDigits = '0123456789ABCDEF'
    return "#" + "".join(random.choice(hexDigits) for _ in range(6))
def makeStringForCiHuiCloud(wordCounts, fileName):
    """Write word-cloud input lines for ``wordCounts`` to ``fileName``.

    Every output line has the form ``size|word|#RRGGBB|1|是``, for example
    ``6|后来|#2D8CF0|1|是``.  A fixed list of default words is written first
    with size ``7``.  The entries of ``wordCounts`` follow; they are split,
    in order, into five equally sized bands that receive the sizes ``6``
    down to ``2``, so the earliest entries get the largest size.

    Args:
        wordCounts: Sequence of ``(word, count)`` pairs, expected to be
            ordered most frequent first (e.g. the result of
            ``Counter.most_common``).  Only the word is written.
        fileName: Path of the file to create or overwrite.  It is written
            as UTF-8.
    """
    default = ['老婆', '宝贝', '肥妞', '刘欣', '猪猪']
    wordSize = ['6', '5', '4', '3', '2']
    wordSum = len(wordCounts)
    bands = len(wordSize)

    # Explicit UTF-8 so the Chinese text is written the same way on every
    # platform; the context manager closes the file even if a write fails.
    with open(fileName, 'w', encoding='utf-8') as f:
        for x in default:
            f.write('|'.join(('7', x, randomColor(), '1', '是')) + '\n')
        for rank, word in enumerate(wordCounts):
            # Zero-based integer banding: rank * bands // wordSum is always
            # in [0, bands - 1], so the first band maps to wordSize[0]
            # instead of wrapping around to the last (smallest) size.
            size = wordSize[rank * bands // wordSum]
            f.write('|'.join((size, word[0], randomColor(), '1', '是')) + '\n')

def main():
    """Build word-cloud input from a chat log.

    Reads the chat export, removes the text from each ``YYYY-MM-DD`` date to
    the end of its line, strips a few punctuation characters and newlines,
    segments the remaining text with jieba, drops stop words, counts the
    remaining tokens, and passes the 200 most common ones to
    ``makeStringForCiHuiCloud`` (output file ``save.txt``).  The same list
    is printed to stdout.
    """
    topN = 200  # number of most frequent words to keep

    # Read the input file.
    with open("民叔(578997662).txt", 'r', encoding='utf-8') as f:
        data = f.read()

    # Remove everything from a date stamp to the end of that line
    # ('.' does not match a newline).
    datePattern = re.compile(r'\d{4}-\d{2}-\d{2}.*')
    temData1 = datePattern.sub('', data)
    # Remove '=', ':', '?', '。', '！', newlines and the literal '|'
    # (the '[|]' character class matches only the pipe character).
    punctPattern = re.compile(r'=|:|\?|。|！|\n|[|]')
    temData2 = punctPattern.sub('', temData1)

    # Custom stop-word set; a set gives constant-time membership tests.
    remove_words = {
        '的', '，', '和', '是', '随着', '对于', '表情', '等', '能', '都',
        '。', ' ', '、', '中', '在', '了', '通常', '如果', '我', '需要',
        '你', '啊', '不', '说', '就', '吗', '[', ']', '？', '图片', '哦',
    }

    # Count every segmented token that is not a stop word.
    word_counts = collections.Counter(
        word
        for word in jieba.cut(temData2, cut_all=False)
        if word not in remove_words
    )
    topWords = word_counts.most_common(topN)
    makeStringForCiHuiCloud(topWords, "save.txt")
    print(topWords)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()